Day 1: Sequence analysis using TraMineR - EDA
Day 2: Sequence analysis using TraMineR - Matching, clustering
Day 3: Association rule mining using priori algorithm
Loading libaries
library(readr)
library(ggplot2)
library(data.table)
library(TraMineR)
library(dplyr)
library(tidyr)
library(TraMineR)
data(mvad)
seqstatl(mvad[, 17:86])
[1] "employment" "FE" "HE" "joblessness" "school" "training"
mvad.alphabet <- c("employment", "FE", "HE", "joblessness", "school",
"training")
mvad.labels <- c("employment", "further education", "higher education",
"joblessness", "school", "training")
mvad.scodes <- c("EM", "FE", "HE", "JL", "SC", "TR")
mvad.seq <- seqdef(mvad, 17:86, alphabet = mvad.alphabet, states = mvad.scodes,
labels = mvad.labels, xtstep = 6)
[>] state coding:
[alphabet] [label] [long label]
1 employment EM employment
2 FE FE further education
3 HE HE higher education
4 joblessness JL joblessness
5 school SC school
6 training TR training
[>] 712 sequences in the data set
[>] min/max sequence length: 70/70
seqiplot(mvad.seq, with.legend = FALSE, title= "Index plot (10 first sequences")
[!!] In seqiplot() : title is deprecated, use main instead.


seqfplot(mvad.seq, withlegend = F, title = "Sequence frequency plot bar width proportional to the frequencies")
[!!] In seqfplot() : title is deprecated, use main instead.
[!!] In seqfplot() : withlegend is deprecated, use with.legend instead.

seqdplot(mvad.seq, withlegend = F, title = "State distribution plot")
[!!] In seqdplot() : title is deprecated, use main instead.
[!!] In seqdplot() : withlegend is deprecated, use with.legend instead.

Import log data
lasi21_logdata <- read_csv("lasi21_logdata.csv")
── Column specification ─────────────────────────────────────────────────────────────────────
cols(
id = col_double(),
sas_id_site = col_double(),
site_type = col_character(),
date_time = col_datetime(format = ""),
spent_time = col_double(),
action = col_character(),
instancename = col_character(),
`avg score` = col_double(),
PassFlag = col_double()
)
lasi21_logdata$site_type <- ifelse(grepl("ssignment", lasi21_logdata$instancename), "assignment",lasi21_logdata$site_type )
lasi21_logdata$site_type <- ifelse(grepl("exam", lasi21_logdata$instancename), "assignment",lasi21_logdata$site_type )
lasi21_logdata <- as.data.table(lasi21_logdata[,c("id","date_time","spent_time","site_type","instancename","PassFlag","avg score")])
head(lasi21_logdata)
Data pre-processing
# convert ms to minutes
lasi21_logdata$spent_time_m <- round(lasi21_logdata$spent_time/60000, digits=1)
# filter out all spent_time < 6s
lasi21_logdata2 <- lasi21_logdata %>% filter(spent_time>6000)
# create a flag for session break (the first click per student)
lasi21_logdata2$session_flag = 0
lasi21_logdata2 <- lasi21_logdata2 %>% arrange(id,date_time,spent_time) %>%
group_by(id) %>%
mutate(session_flag = +(row_number() %in% 1))
# create a flag for session break (aka where time spent > 30 minutes)
lasi21_logdata2$session_flag <- ifelse(lasi21_logdata2$spent_time_m>30, 1, lasi21_logdata2$session_flag)
# create session number
lasi21_logdata2$session_num = cumsum(lasi21_logdata2$session_flag)
# remove session break
lasi21_logdata2 <- as.data.table(lasi21_logdata2)
lasi21_logdata2 <- lasi21_logdata2[session_flag!=1,]
# for each learning session, calculate the cummulative time spent
lasi21_logdata2 <- lasi21_logdata2 %>%
arrange(id,date_time,spent_time) %>%
group_by(session_num) %>%
mutate(spent_time_m_cum = cumsum(spent_time_m))
# create time unit as 1/10 of a minute (6s)
lasi21_logdata2$time_unit <- round(lasi21_logdata2$spent_time_m_cum*10,digits=0)
lasi21_logdata_session <- lasi21_logdata2 %>% group_by(session_num) %>% top_n(1, spent_time_m_cum)
hist(lasi21_logdata_session$spent_time_m_cum, main="Learning session length", xlab = "Minutes", breaks=200)

# Remove learning session < 5 mins
lasi21_logdata2 <- merge(lasi21_logdata2,lasi21_logdata_session[lasi21_logdata_session$spent_time_m_cum>5 & lasi21_logdata_session$spent_time_m_cum<120, c("session_num")])
Define sequences
# define sequences columns
log_sts.seq <- seqdef(log_sts,2:1200)
[>] found missing values ('NA') in sequence data
[>] preparing 18602 sequences
[>] coding void elements with '%' and missing values with '*'
[>] 12 distinct states appear in the data:
1 = assignment
2 = collaborate
3 = content
4 = forumng
5 = glossary
6 = homepage
7 = questionnaire
8 = resource
9 = studio
10 = subpage
11 = url
12 = wiki
[>] state coding:
[alphabet] [label] [long label]
1 assignment assignment assignment
2 collaborate collaborate collaborate
3 content content content
4 forumng forumng forumng
5 glossary glossary glossary
6 homepage homepage homepage
7 questionnaire questionnaire questionnaire
8 resource resource resource
9 studio studio studio
10 subpage subpage subpage
11 url url url
12 wiki wiki wiki
[>] 18602 sequences in the data set
[>] min/max sequence length: 51/1199
EDA
# plot the first 10 sequences with length=100
layout(matrix(c(1,1,1,2), nrow=1, byrow=TRUE))
seqiplot(log_sts.seq[611:631,1:100], with.legend = F, main = "Index plot (10 first sequences)")
seqlegend(log_sts.seq)


# State distribution plot
layout(matrix(c(1,1,1,2), nrow=1, byrow=TRUE))
seqdplot(log_sts.seq[,1:200], main = "State distribution plot", with.legend = FALSE)
seqlegend(log_sts.seq)

# Sequence frequency plot
layout(matrix(c(1,1,1,2), nrow=1, byrow=TRUE))
seqfplot(log_sts.seq[,1:200], main = "Sequence frequency plot", with.legend = FALSE, pbarw = TRUE)
seqlegend(log_sts.seq)

# mean time by state
layout(matrix(c(1,1,1,2), nrow=1, byrow=TRUE))
seqmtplot(log_sts.seq[,1:200], main = "Mean time plot", with.legend = FALSE)
seqlegend(log_sts.seq)

# Stability within sequences
# Shannon's entropy as a measure of the diversity of states observed at the considered time point
# It equals 0 when all cases are in the same state (it is thus easy to predict in which state an individual is)
# It is maximum when the cases are equally distributed between the states
seqHtplot(log_sts.seq[,1:200], title = "Entropy index")
[!!] In seqHtplot() : title is deprecated, use main instead.

# Turbulence
Turbulence <- seqST(log_sts.seq[,1:200])
summary(Turbulence)
Turbulence
Min. : 1.000
1st Qu.: 2.000
Median : 4.038
Mean : 4.704
3rd Qu.: 6.641
Max. :32.312
hist(Turbulence, col = "cyan", main = "Sequence turbulence",breaks=50)

# Transitions of events
# define seq transitions
log_sts.seqe <- seqecreate(log_sts.seq[,1:200])
# find frequent subsequences
fsubseq <- seqefsub(log_sts.seqe, pMinSupport = 0.05)
[!!] In seqefsub() : pMinSupport is deprecated, use pmin.support instead.
# plot 15 most frequent subsquences
plot(fsubseq[1:15], col = "cyan", main="Top 15 frequent subsequences")

Sequence similarities and clustering
# Compute sequence distances using OM with transition rate as substitution cost
log_sts.seq.om1 <- round(seqdist(log_sts.seq[1:1000,1:200], method = "OM", indel = 1, sm = "CONSTANT"),2)
[>] 1000 sequences with 12 distinct states
[>] Computing sm with seqcost using CONSTANT
[>] creating 12x12 substitution-cost matrix using 2 as constant value
[>] 846 distinct sequences
[>] min/max sequence lengths: 51/200
[>] computing distances using the OM metric
[>] elapsed time: 28.993 secs


fviz_nbclust(log_sts.seq.om1, FUN = hcut, method = "silhouette")+
labs(subtitle = "Silhouette method")

cluster3 <- cutree(clusterward, k = 3)
cluster3 <- factor(HC3c, labels = paste("Type", 1:3))
table(cluster3)
cluster3
Type 1 Type 2 Type 3
442 291 267
# frequency plot of sequences by cluster
seqfplot(log_sts.seq[1:1000,1:200], group = cluster3, pbarw = T)

seqmtplot(log_sts.seq[1:1000,1:200], group = cluster3)

# layout(matrix(c(1,1,1,2), nrow=1, byrow=TRUE))
seqdplot(log_sts.seq[1:1000,1:200], group = cluster3, border = NA, with.legend = T)

# seqlegend(log_sts.seq)
covar$grade <- rnorm(1000, mean=70, sd=10); covar$grade <- covar$grade[covar$grade > 1 & covar$grade < 100]
Error in `$<-.data.frame`(`*tmp*`, grade, value = c(72.1121413037559, :
replacement has 998 rows, data has 1000
reglog <- list()
for (i in 1:length(levels(cluster3))) {
reglog[[i]] <- glm((cluster3 == levels(cluster3)[i]) ~ condition + grade + surveymetric1 + surveymetric2, family = "binomial", data = covar)}
tbls <- list()
for (i in 1:length(levels(cluster3))) {tbls[[i]] <- tbl_regression(reglog[[i]], exponentiate = TRUE)}
tbl_merge(
tbls = tbls,
tab_spanner = c("**Type 1**", "**Type 2**", "**Type 3**")
)
| Characteristic |
Type 1
|
Type 2
|
Type 3
|
| OR |
95% CI |
p-value |
OR |
95% CI |
p-value |
OR |
95% CI |
p-value |
| condition |
|
|
|
|
|
|
|
|
|
| Control |
— |
— |
|
— |
— |
|
— |
— |
|
| Treatment |
0.90 |
0.70, 1.16 |
0.4 |
1.15 |
0.87, 1.51 |
0.3 |
0.99 |
0.75, 1.31 |
>0.9 |
| grade |
1.00 |
0.99, 1.02 |
0.5 |
0.99 |
0.98, 1.01 |
0.4 |
1.00 |
0.99, 1.02 |
0.8 |
| surveymetric1 |
0.99 |
0.71, 1.36 |
>0.9 |
0.87 |
0.61, 1.24 |
0.4 |
1.18 |
0.82, 1.69 |
0.4 |
| surveymetric2 |
0.98 |
0.80, 1.19 |
0.8 |
0.99 |
0.80, 1.23 |
>0.9 |
1.04 |
0.83, 1.30 |
0.7 |
Comparison of sequences

---
title: "Temporal and sequential analysis for learning analytics"
author: "Quan Nguyen, Postdoctoral Fellow, School of Information, University of Michigan"
output: html_notebook
---


# Day 1: Sequence analysis using TraMineR - EDA
# Day 2: Sequence analysis using TraMineR - Matching, clustering
# Day 3: Association rule mining using priori algorithm

## Loading libaries

```{r}
library(readr)
library(ggplot2)
library(data.table)
library(TraMineR)
library(dplyr)
library(tidyr)
```

```{r}
library(TraMineR)
data(mvad)
seqstatl(mvad[, 17:86])
mvad.alphabet <- c("employment", "FE", "HE", "joblessness", "school", 
    "training")
mvad.labels <- c("employment", "further education", "higher education", 
    "joblessness", "school", "training")
mvad.scodes <- c("EM", "FE", "HE", "JL", "SC", "TR")
mvad.seq <- seqdef(mvad, 17:86, alphabet = mvad.alphabet, states = mvad.scodes, 
    labels = mvad.labels, xtstep = 6)

```
```{r}
head(mvad)
```

```{r}
seqiplot(mvad.seq, with.legend = FALSE, title= "Index plot (10 first sequences")
```
```{r}
seqIplot(mvad.seq, sortv = "from.start", with.legend = FALSE)
```
```{r}
seqfplot(mvad.seq, withlegend = F, title = "Sequence frequency plot bar width proportional to the frequencies")
```
```{r}
 seqdplot(mvad.seq, withlegend = F,  title = "State distribution plot")
```


# Import log data

```{r}
lasi21_logdata <- read_csv("lasi21_logdata.csv")
lasi21_logdata$site_type <- ifelse(grepl("ssignment", lasi21_logdata$instancename), "assignment",lasi21_logdata$site_type )
lasi21_logdata$site_type <- ifelse(grepl("exam", lasi21_logdata$instancename), "assignment",lasi21_logdata$site_type )

lasi21_logdata <- as.data.table(lasi21_logdata[,c("id","date_time","spent_time","site_type","instancename","PassFlag","avg score")])
head(lasi21_logdata)
```

# Data pre-processing
```{r}
# convert ms to minutes
lasi21_logdata$spent_time_m <- round(lasi21_logdata$spent_time/60000, digits=1)

# filter out all spent_time < 6s
lasi21_logdata2 <- lasi21_logdata %>% filter(spent_time>6000)

# create a flag for session break (the first click per student)
lasi21_logdata2$session_flag = 0
lasi21_logdata2 <- lasi21_logdata2 %>% arrange(id,date_time,spent_time) %>% 
    group_by(id) %>%
    mutate(session_flag  = +(row_number() %in% 1))

# create a flag for session break (aka where time spent > 30 minutes)
lasi21_logdata2$session_flag <- ifelse(lasi21_logdata2$spent_time_m>30, 1, lasi21_logdata2$session_flag)

# create session number 
lasi21_logdata2$session_num = cumsum(lasi21_logdata2$session_flag)

# remove session break
lasi21_logdata2 <- as.data.table(lasi21_logdata2)
lasi21_logdata2 <- lasi21_logdata2[session_flag!=1,]


# for each learning session, calculate the cummulative time spent
lasi21_logdata2 <- lasi21_logdata2 %>% 
                        arrange(id,date_time,spent_time) %>% 
                        group_by(session_num) %>% 
                        mutate(spent_time_m_cum = cumsum(spent_time_m))

# create time unit as 1/10 of a minute (6s)
lasi21_logdata2$time_unit <- round(lasi21_logdata2$spent_time_m_cum*10,digits=0)

lasi21_logdata_session <- lasi21_logdata2 %>% group_by(session_num) %>% top_n(1, spent_time_m_cum)

hist(lasi21_logdata_session$spent_time_m_cum, main="Learning session length", xlab = "Minutes", breaks=200)

# Remove learning session < 5 mins
lasi21_logdata2 <- merge(lasi21_logdata2,lasi21_logdata_session[lasi21_logdata_session$spent_time_m_cum>5 & lasi21_logdata_session$spent_time_m_cum<120, c("session_num")])

```

# Transform data into STS format
```{r}
# create a function to expand time unit for each learning session

f1 <- function(x1){
    x1 <- 1:max(x1)
    m1 <- max(c(length(x1)))
    length(x1) <- m1
    list(time_unit = x1)
}

# create a subset
lasi21_logdata3 <- lasi21_logdata2[,c("session_num","time_unit","site_type")]

# create an expanded df
test <- setDT(lasi21_logdata3)[, f1(time_unit), .(session_num)]

# merge with log data
test <- merge(test,lasi21_logdata3,by=c("session_num","time_unit"),all.x=TRUE)

# fill missing upward
test <- test %>% fill(site_type,.direction = "up")
```



```{r}
# Sequence length distribution
hist(lasi21_logdata3[,max(time_unit), by="session_num"]$V1, breaks=100, main="Length of learning session", xlab='Sequence length')
```

```{r}
# cummulative plot of seq length
plot(ecdf(lasi21_logdata3[,max(time_unit), by="session_num"]$V1), main ='Cummulative distribution of seq length')
```
# Define sequences

```{r}
# convert to STS format (wide format)
log_sts <- spread(test, time_unit, site_type)

# define sequences columns
log_sts.seq <- seqdef(log_sts,2:1200)

```

# EDA

```{r}
# plot the first 10 sequences with length=100
layout(matrix(c(1,1,1,2), nrow=1, byrow=TRUE))
seqiplot(log_sts.seq[611:631,1:100], with.legend = F, main = "Index plot (10 first sequences)")
seqlegend(log_sts.seq)
```
```{r}
# Plot 200 sequences sorted by LCS (longest common subsequence) distance
dist.mostfreq <- seqdist(log_sts.seq, method = "LCS", refseq = 0)
seqIplot(log_sts.seq[1:200,1:200], border = NA, sortv = dist.mostfreq, with.legend=F)
```


```{r}
# State distribution plot
layout(matrix(c(1,1,1,2), nrow=1, byrow=TRUE))
seqdplot(log_sts.seq[,1:200], main = "State distribution plot", with.legend = FALSE)
seqlegend(log_sts.seq)
```

```{r}
# Sequence frequency plot
layout(matrix(c(1,1,1,2), nrow=1, byrow=TRUE))
seqfplot(log_sts.seq[,1:200], main = "Sequence frequency plot", with.legend = FALSE, pbarw = TRUE)
seqlegend(log_sts.seq)
```
```{r}
# mean time by state
layout(matrix(c(1,1,1,2), nrow=1, byrow=TRUE))
seqmtplot(log_sts.seq[,1:200], main = "Mean time plot", with.legend = FALSE)
seqlegend(log_sts.seq)
```

```{r}
# Stability within sequences
# Shannon's entropy as a measure of the diversity of states observed at the considered time point
# It equals 0 when all cases are in the same state  (it is thus easy to predict in which state an individual is)
# It is maximum when the cases are equally distributed between the states
seqHtplot(log_sts.seq[,1:200], title = "Entropy index")
```

```{r}
# Turbulence
Turbulence <- seqST(log_sts.seq[,1:200])
summary(Turbulence)
hist(Turbulence, col = "cyan", main = "Sequence turbulence",breaks=50)

```

```{r}
# Transitions of events

# define seq transitions
log_sts.seqe <- seqecreate(log_sts.seq[,1:200])

# find frequent subsequences
fsubseq <- seqefsub(log_sts.seqe, pMinSupport = 0.05)

# plot 15 most frequent subsquences
plot(fsubseq[1:15], col = "cyan", main="Top 15 frequent subsequences")

```

# Sequence similarities and clustering

```{r}
# Compute sequence distances using OM with transition rate as substitution cost
log_sts.seq.om1 <- round(seqdist(log_sts.seq[1:1000,1:200], method = "OM", indel = 1, sm = "CONSTANT"),2)
```
```{r}
# apply agglomerate hierarchical clustering
library(cluster)
clusterward <- agnes(log_sts.seq.om1, diss = TRUE, method = "ward")
hcd <- as.dendrogram(clusterward)
plot(hcd, which.plots = 2, main="Dendogram of sequences clustering", leaflab = "none")
```
```{r}
library(factoextra)
# Plot scree plot using wss
fviz_nbclust(log_sts.seq.om1, FUN = hcut, method = "wss")
```
```{r}
# Plot Silhouette 
fviz_nbclust(log_sts.seq.om1, FUN = hcut, method = "silhouette")+
labs(subtitle = "Silhouette method")
```
```{r}
# Use k=3 as the number of cluster
cluster3 <- cutree(clusterward, k = 3)
cluster3 <- factor(HC3c, labels = paste("Type", 1:3))
table(cluster3)
```
```{r}
# frequency plot of sequences by cluster
seqfplot(log_sts.seq[1:1000,1:200], group = cluster3, pbarw = T)
```

```{r}
# mean time plot of states by cluster membership
seqmtplot(log_sts.seq[1:1000,1:200], group = cluster3)
```
```{r}
# layout(matrix(c(1,1,1,2), nrow=1, byrow=TRUE))
seqdplot(log_sts.seq[1:1000,1:200], group = cluster3, border = NA, with.legend = T)
# seqlegend(log_sts.seq)

```

```{r}
# made up covariates for 1000 sequences (e.g., condition, grade, survey_metric1, survey_metric2)
covar <- data.frame(matrix(ncol = 4, nrow = 1000))
colnames(covar) <- c("condition", "grade", "surveymetric1", "surveymetric2")

set.seed(123)
covar$condition <- sample(x=c("Control","Treatment"),1000,prob=c(0.5,0.5),replace=T)

set.seed(123)
covar$grade <- round(rnorm(1000, mean=60, sd=10),0)

set.seed(123)
covar$surveymetric1 <- sample(x=1:5, prob = c(0.1,0.2,0.4,0.2,0.1), replace = T)

set.seed(123)
covar$surveymetric2 <- sample(x=1:5, prob = c(0.1,0.2,0.3,0.3,0.1), replace = T)
```

```{r}
library(gtsummary)

# run logistic regression for each cluster of sequences
reglog <- list()
for (i in 1:length(levels(cluster3))) {reglog[[i]] <- glm((cluster3 == levels(cluster3)[i]) ~ 
                                                            condition + grade + surveymetric1 + surveymetric2, 
                                                            family = "binomial", 
                                                            data = covar)}
# create nice summary output using gtsummary package 
# http://www.danieldsjoberg.com/gtsummary/articles/tbl_regression.html
tbls <- list()
for (i in 1:length(levels(cluster3))) {tbls[[i]] <- tbl_regression(reglog[[i]], exponentiate = TRUE)}

tbl_merge(
    tbls = tbls,
    tab_spanner = c("**Type 1**", "**Type 2**", "**Type 3**")
  )
```

# Comparison of sequences

```{r}
# Plot sequences by group
dist.refseq <- seqdist(log_sts.seq[1:1000,1:200], refseq = 0, method = "LCS")

seqIplot(log_sts.seq[1:1000,1:200], group = covar$condition, sortv = dist.refseq, with.legend = "right")
```



